1 Introduction

This workbook loads the lifebook data and creates a sample subpopulation of the data to be analysed in the rest of the project.

lifebook_tbl <- read_csv("data/dataset.csv", progress = FALSE) %>%
    mutate_if(is.character, as.factor)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   prem_freq = col_integer(),
##   prem_ape = col_double(),
##   prem_risk = col_double(),
##   policy_startdate = col_date(format = ""),
##   policy_enddate = col_date(format = ""),
##   policy_duration = col_integer(),
##   mort_rating = col_double(),
##   sum_assured = col_double(),
##   dob_life1 = col_date(format = ""),
##   isjointlife = col_logical(),
##   islifeonly = col_logical(),
##   policy_statuschangedate = col_date(format = ""),
##   lapsed = col_logical(),
##   subpop = col_logical()
## )
## See spec(...) for full column specifications.
glimpse(lifebook_tbl)
## Observations: 1,500,000
## Variables: 26
## $ policy_id               <fctr> C010000056, C010000063, C010000106, C010000141, C010...
## $ countyname              <fctr> Offaly County, Fingal, Cork County, Dublin City, Sou...
## $ edname                  <fctr> Ballyburly, Castleknock-Knockmaroon, Cobh Rural, Cab...
## $ nuts3name               <fctr> Midland, Dublin, South-West (IE), Dublin, Dublin, Du...
## $ sa_id                   <fctr> A187004004, A267040024, A047106023, A268030019, A267...
## $ cluster_id              <fctr> n6_c5, n6_c4, n6_c5, n6_c0, n6_c0, n6_c4, n6_c2, n6_...
## $ prod_type               <fctr> protection, pension, pension, protection, savings, p...
## $ prem_type               <fctr> RP, SP, RP, RP, SP, RP, RP, RP, RP, RP, RP, RP, RP, ...
## $ prem_freq               <int> 12, NA, 12, 12, NA, 12, 12, 12, 4, 12, 12, 12, 12, 12...
## $ prem_ape                <dbl> 303.27, 1249.95, 1183.20, 981.40, 600.00, 2289.99, 37...
## $ prem_risk               <dbl> 204.372, NA, NA, 738.679, NA, NA, NA, NA, NA, NA, NA,...
## $ policy_startdate        <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-02, 1990...
## $ policy_enddate          <date> 1995-01-02, 2063-03-24, 2071-07-15, 2010-01-02, 2000...
## $ policy_duration         <int> 5, NA, NA, 20, 10, NA, 20, NA, 20, 10, NA, 20, 10, 20...
## $ mort_rating             <dbl> 150, NA, NA, 100, NA, NA, NA, NA, NA, NA, NA, 100, 20...
## $ sum_assured             <dbl> 200000, NA, NA, 250000, NA, NA, NA, NA, NA, NA, NA, 4...
## $ dob_life1               <date> 1962-04-29, 1943-03-24, 1951-07-15, 1955-05-26, 1950...
## $ gender_life1            <fctr> F, M, M, M, M, M, F, F, M, M, M, M, M, M, M, M, M, F...
## $ smoker_life1            <fctr> Q, Q, N, N, S, N, N, Q, Q, N, N, N, S, S, N, S, Q, Q...
## $ isjointlife             <lgl> FALSE, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, FAL...
## $ islifeonly              <lgl> TRUE, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, TRUE...
## $ mortgage_status         <fctr> TERM, NA, NA, MORTDECR, NA, NA, NA, NA, NA, NA, NA, ...
## $ policy_status           <fctr> lapsed, lapsed, lapsed, lapsed, lapsed, lapsed, laps...
## $ policy_statuschangedate <date> 1991-03-02, 1998-12-02, 1997-09-02, 1990-07-02, 1998...
## $ lapsed                  <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,...
## $ subpop                  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...

Having loaded the data, we now separate the data into four categories: logical, numeric, categorical, and date/time.

drop_var <- c('policy_id', 'edname', 'sa_id')
subpop_var <- 'subpop'

subpop_vals <- lifebook_tbl[[subpop_var]] == TRUE
subpop_tbl  <- lifebook_tbl %>% filter(subpop_vals)

datatypes_tbl <- lifebook_tbl %>%
    select(-one_of(c(subpop_var, drop_var))) %>%
    summarise_each(funs(class)) %>%
    gather('variable','datatype')

var_types <- datatypes_tbl %>%
    .[['datatype']] %>%
    unique

gen_list <- lapply(var_types
                   ,function(x) datatypes_tbl %>% filter(datatype == x) %>% .[['variable']])

names(gen_list) <- var_types

type_list <- list(
    categorical = c(gen_list$factor, gen_list$logical)
   ,numeric     = c(gen_list$numeric, gen_list$integer)
   ,datetime    = gen_list$Date
)

2 Initial Visualisations

We first create some simple plots based on the indicator.

ggplot(lifebook_tbl) +
    geom_bar(aes(x = subpop)) +
    scale_y_continuous(labels = comma) +
    ylab("Count")

3 Facet Plots

facet_formula <- formula(paste0("~", subpop_var))

3.1 Categorical Variables

for(plot_var in type_list$categorical) {
    cat(paste0("Plot Var: ", plot_var, "\n"))
    
    var_plot <- ggplot(lifebook_tbl) +
        geom_bar(aes_string(x = plot_var)) +
        facet_wrap(facet_formula, scales = 'free') +
        scale_y_continuous(labels = comma) +
        xlab(plot_var) +
        ylab("Count") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

    print(var_plot)
}
## Plot Var: countyname

## Plot Var: nuts3name

## Plot Var: cluster_id

## Plot Var: prod_type

## Plot Var: prem_type

## Plot Var: gender_life1

## Plot Var: smoker_life1

## Plot Var: mortgage_status

## Plot Var: policy_status

## Plot Var: isjointlife

## Plot Var: islifeonly

## Plot Var: lapsed

3.2 Numeric Variables

for(plot_var in type_list$numeric) {
    cat(paste0("Plot Var: ", plot_var, "\n"))
    
    var_plot <- ggplot(lifebook_tbl) +
        geom_histogram(aes_string(x = plot_var), bins = 50) +
        facet_wrap(facet_formula, scales = 'free') +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        xlab(plot_var) +
        ylab("Count")

    print(var_plot)
}
## Plot Var: prem_ape

## Plot Var: prem_risk
## Warning: Removed 727307 rows containing non-finite values (stat_bin).

## Plot Var: mort_rating
## Warning: Removed 727307 rows containing non-finite values (stat_bin).

## Plot Var: sum_assured
## Warning: Removed 727307 rows containing non-finite values (stat_bin).

## Plot Var: prem_freq
## Warning: Removed 233199 rows containing non-finite values (stat_bin).

## Plot Var: policy_duration
## Warning: Removed 475193 rows containing non-finite values (stat_bin).

3.3 Date/Time Variables

for(plot_var in type_list$datetime) {
    cat(paste0("Plot Var: ", plot_var, "\n"))
    
    var_plot <- ggplot(lifebook_tbl) +
        geom_histogram(aes_string(x = plot_var), bins = 50) +
        facet_wrap(facet_formula, scales = 'free') +
        scale_x_date(date_labels = '%Y-%m-%d') +
        scale_y_continuous(labels = comma) +
        xlab(plot_var) +
        ylab("Count") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
        

    print(var_plot)
}
## Plot Var: policy_startdate

## Plot Var: policy_enddate

## Plot Var: dob_life1

## Plot Var: policy_statuschangedate

4 Bootstrap Checks

This approach compares the data in the subpopulation against a number of bootstrap samples from the full dataset, and then plots the two against each other.

bootstrap_count <- 250

calc_bootstrap_stats <- function(x, b) {
    if(missing(b)) b <- seq_along(x)
    
    use_x     <- x[b]
    samp_mean <- mean(use_x)
    samp_perc <- quantile(use_x
                         ,type = 1
                         ,probs = c(0.50, 0.01, 0.10, 0.25, 0.75, 0.90, 0.99))

    return(c(mean = samp_mean, samp_perc))    
}

generate_bootstrap_props <- function(data_tbl, cat_varname, count) {
    bs_tbl <- data_tbl %>%
        sample_n(count) %>%
        group_by_(cat_varname) %>%
        summarise(count = n()) %>%
        mutate(prop = count / sum(count))
    
    return(bs_tbl)
}

4.1 Categorical Variables

Dealing with the bootstrap for categorical variables is not obvious to me.

for(plot_var in type_list$categorical) {
    subpop_prop_tbl <- subpop_tbl %>%
        group_by_(plot_var) %>%
        summarise(count = n()) %>%
        mutate(prop = count / sum(count)
              ,idx  = 1) %>%
        arrange_(plot_var)

    catprop_lst <- list()
    
    for(i in 1:bootstrap_count) {
        catprop_lst[[i]] <- lifebook_tbl %>%
            generate_bootstrap_props(plot_var, subpop_tbl %>% nrow) %>%
            mutate(idx = i)
    }
    
    bootprops_tbl <- catprop_lst %>% bind_rows()
    rm(catprop_lst)
    
    cat_plot <- ggplot(bootprops_tbl) +
        geom_line(aes_string(x = plot_var, y = 'prop', group = 'idx')
                 ,alpha = 0.1) +
        geom_line(aes_string(x = plot_var, y = 'prop', group = 1)
                 ,data = subpop_prop_tbl
                 ,colour = 'red') +
        expand_limits(y = 0) +
        xlab(plot_var) +
        ylab("Proportion") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
    
    print(cat_plot)
}

4.2 Numeric Variables

We take bootstrap samples of the full dataset, calculate the statistics and compare them to the corresponding statistic in the subpopulation.

for(plot_var in type_list$numeric) {
    cat(paste0("Plot Var: ", plot_var, "\n"))

    var_vals <- lifebook_tbl %>% .[[plot_var]]
    var_vals <- var_vals[!is.na(var_vals)]

    subpop_stats_tbl <- var_vals %>%
        calc_bootstrap_stats %>%
        t %>%
        as_data_frame %>%
        gather('variable','value')

    data_boot <- boot(var_vals, calc_bootstrap_stats, R = bootstrap_count)    
    
    boot_tbl        <- data_boot$t %>% as_data_frame
    names(boot_tbl) <- subpop_stats_tbl$variable
    
    bootplot_tbl <- boot_tbl %>%
        mutate(iter = 1:n()) %>%
        gather('variable','value',-iter)

    var_plot <- ggplot(bootplot_tbl) +
        geom_density(aes(x = value)) +
        geom_vline(aes(xintercept = value), data = subpop_stats_tbl, colour = 'red') +
        facet_wrap(~variable, scales = 'free') +
        scale_x_continuous(labels = comma) +
        xlab(plot_var)
        
    print(var_plot)
}
## Plot Var: prem_ape

## Plot Var: prem_risk

## Plot Var: mort_rating

## Plot Var: sum_assured

## Plot Var: prem_freq

## Plot Var: policy_duration

4.3 Date/Time Variables

for(plot_var in type_list$datetime) {
    cat(paste0("Plot Var: ", plot_var, "\n"))

    var_vals <- lifebook_tbl %>% .[[plot_var]]
    var_vals <- var_vals[!is.na(var_vals)]

    stat_vals <- var_vals %>% calc_bootstrap_stats

    subpop_stats_tbl <- data_frame(variable = names(stat_vals)
                                  ,value    = stat_vals)
    
    data_boot <- boot(var_vals, calc_bootstrap_stats, R = bootstrap_count)    
    
    boot_tbl        <- data_boot$t %>% as_data_frame
    names(boot_tbl) <- subpop_stats_tbl$variable
    
    bootplot_tbl <- boot_tbl %>%
        mutate(iter = 1:n()) %>%
        gather('variable','value',-iter) %>%
        mutate(value = as.Date(value, origin = '1970-01-01'))

    var_plot <- ggplot(bootplot_tbl) +
        geom_histogram(aes(x = value), bins = 50) +
        geom_vline(aes(xintercept = as.numeric(value)), data = subpop_stats_tbl, colour = 'red') +
        facet_wrap(~variable, scales = 'free') +
        scale_x_date(date_labels = '%Y-%m-%d') +
        xlab(plot_var) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
        
        
    print(var_plot)
}
## Plot Var: policy_startdate

## Plot Var: policy_enddate

## Plot Var: dob_life1

## Plot Var: policy_statuschangedate